Geometry Operations and Spatial Analysis#

Setup#

from sqlalchemy import create_engine
import geopandas as gpd
import pandas as pd
import requests
import shutil
import folium
import json
from pathlib import Path
# Connection string for the PostGIS-enabled Postgres instance running in Docker.
engine_str = "postgresql+psycopg2://docker:docker@0.0.0.0:25432/restaurants"
engine = create_engine(engine_str)
# Load the ipython-sql extension and point %sql / %%sql cells at the same engine.
%load_ext sql
%sql $engine.url
'Connected: docker@restaurants'
QUEENS = [40.7292, -73.9049]  # approximate map center for Queens, NY (lat, lon)


def to_map(result_set, location=QUEENS, geom_col='geom', epsg=4326, zoom=13, fillcolor='orange'):
    """Render an ipython-sql result set as an interactive folium map.

    Parameters
    ----------
    result_set : ipython-sql ResultSet
        Must expose ``.DataFrame()`` and contain a WKB geometry column
        named ``geom_col``.
    location : list[float]
        ``[lat, lon]`` map center; defaults to Queens.
    geom_col : str
        Name of the WKB geometry column in the result set.
    epsg : int
        EPSG code of the stored geometries; anything other than 4326 is
        reprojected for display.
    zoom : int
        Initial folium zoom level.
    fillcolor : str
        Fill color applied to every rendered feature.

    Returns
    -------
    folium.Map
    """
    m = folium.Map(location=location, zoom_start=zoom)
    df = result_set.DataFrame()
    gdf = gpd.GeoDataFrame(df, geometry=gpd.GeoSeries.from_wkb(df[geom_col], crs=epsg))

    # folium expects WGS84 lat/lon coordinates.
    if epsg != 4326:
        gdf.to_crs(4326, inplace=True)

    geojson = json.loads(gdf.to_json())

    for feature in geojson['features']:
        # Guard against both a missing 'properties' key and an explicit null;
        # an empty properties dict also gets no popup (the original built a
        # useless empty GeoJsonPopup in that case).
        properties = feature.get('properties')

        popup = None
        if properties:
            # Show every attribute except the geometry column itself.
            fields = [c for c in properties if geom_col not in c.lower()]
            popup = folium.GeoJsonPopup(fields)

        folium.GeoJson(
            data=feature,
            style_function=lambda x: {'fillColor': fillcolor},
            popup=popup,
        ).add_to(m)

    return m

Import & Join#

NY State Census Block Groups#

retrieved from census.gov

# Base name of the 2017 NY State (FIPS 36) census block-group shapefile.
ny_cb = 'cb_2017_36_bg_500k'
%%bash -s "$ny_cb"
# Extract the shapefile bundle and report its native CRS (EPSG code).
unzip -o "../data/$1.zip" -d "../data/tmp/$1"
gdalsrsinfo -e "../data/tmp/$1/$1.shp" | grep "^EPSG"
Archive:  ../data/cb_2017_36_bg_500k.zip
  inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shp.ea.iso.xml  
  inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shp.iso.xml  
  inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shp.xml  
  inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shp  
  inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.shx  
  inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.dbf  
  inflating: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.prj  
 extracting: ../data/tmp/cb_2017_36_bg_500k/cb_2017_36_bg_500k.cpg  
EPSG:4269
%%bash -s "$ny_cb"
# Load the shapefile into PostGIS as table "census_block_ny":
#   -nlt PROMOTE_TO_MULTI    promote Polygon -> MultiPolygon for a uniform geometry type
#   -nln census_block_ny     target table name
#   -lco GEOMETRY_NAME=geom  name the geometry column "geom"
#   -a_srs EPSG:4269         assign the shapefile's native CRS (reported above)
#   -overwrite               replace the table if it already exists
ogr2ogr -f "PostgreSQL" \
 PG:"host='0.0.0.0' port='25432' user='docker' password='docker' dbname='restaurants'" "../data/tmp/$1/$1.shp" \
 -nlt PROMOTE_TO_MULTI \
 -nln "census_block_ny" \
 -lco GEOMETRY_NAME=geom \
 -a_srs "EPSG:4269" \
 -overwrite 

American Community Survey Population#

# Paths for the ACS educational-attainment download (table B15003, Queens).
acs_zip = '../data/Queens_Education_Attainment_ACS_17_5YR_B15003.zip' 
acs =  '../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003'
%%bash -s "$acs_zip" "$acs"
# Extract the ACS CSVs into the tmp directory.
unzip -o $1 -d $2
Archive:  ../data/Queens_Education_Attainment_ACS_17_5YR_B15003.zip
  inflating: ../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003/ACS_17_5YR_B15003.csv  
  inflating: ../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003/ACS_17_5YR_B15003_metadata.csv  
  inflating: ../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003/ACS_17_5YR_B15003.txt  
  inflating: ../data/tmp/Queens_Education_Attainment_ACS_17_5YR_B15003/aff_download_readme.txt  

Let’s save a little time and import the data via pandas rather than manually creating a table schema and using a COPY statement.

We are only interested in these Population aggregates from the metadata:

    HD01_VD01,Estimate; Total:
    HD01_VD22,Estimate; Total: - Bachelor's degree
    HD01_VD23,Estimate; Total: - Master's degree
    HD01_VD24,Estimate; Total: - Professional school degree
    HD01_VD25,Estimate; Total: - Doctorate degree
# Map of raw ACS column names -> friendly names; the keys double as the
# usecols selection so the two lists can never drift apart.
acs_columns = {
    "GEO.id": "geo_id_1",
    "GEO.id2": "geo_id_2",
    "GEO.display-label": "name",
    "HD01_VD01": "pop_total",
    "HD01_VD22": "pop_bachelors",
    "HD01_VD23": "pop_masters",
    "HD01_VD24": "pop_professional_school",
    "HD01_VD25": "pop_doctorate",
}

# Read only the columns of interest and rename them in one chained step.
acs_df = pd.read_csv(
    f"{acs}/ACS_17_5YR_B15003.csv",
    usecols=list(acs_columns),
).rename(columns=acs_columns)

# Persist to Postgres so we can join it against the census geometries in SQL.
with engine.connect() as con:
    acs_df.to_sql('queens_acs', con, if_exists='replace', index=False)
# Inspect the ACS join-key format (e.g. 1500000US360810001001).
%sql SELECT geo_id_1 FROM queens_acs LIMIT 5;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
5 rows affected.
geo_id_1
1500000US360810001001
1500000US360810002001
1500000US360810002002
1500000US360810004001
1500000US360810004002
# The census table's affgeoid uses the same format -- our join key.
%sql SELECT affgeoid FROM census_block_ny LIMIT 5;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
5 rows affected.
affgeoid
1500000US360550116014
1500000US360810273005
1500000US360470336001
1500000US360470355001
1500000US361219707003
%%sql
-- Join the ACS population figures onto the census block-group geometries.
DROP TABLE IF EXISTS queens_census_pop;

CREATE TABLE queens_census_pop AS
    SELECT geo_id_1 as geoid,
        queens_acs.name, 
        pop_total,
        pop_bachelors,
        pop_masters,
        pop_professional_school,
        pop_doctorate,
        geom 
    FROM queens_acs 
    LEFT JOIN census_block_ny
    ON queens_acs.geo_id_1 = census_block_ny.affgeoid
    -- dropping unmatched rows makes this effectively an inner join
    WHERE geom IS NOT NULL;

-- Spatial index for the intersection queries below.
CREATE INDEX IF NOT EXISTS queens_census_pop_geom_idx ON queens_census_pop USING gist(geom);
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
Done.
1739 rows affected.
Done.
[]
%%sql queens_cb_pop <<
-- Capture the joined table into a local Python variable for mapping.
SELECT * FROM queens_census_pop;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
1739 rows affected.
Returning data to local variable queens_cb_pop

Click on polygons to view data

# Render the Queens block groups on an interactive map.
to_map(queens_cb_pop)
Make this Notebook Trusted to load map: File -> Trust Notebook

Add geography columns to facilitate proximity analysis via ST_Buffer

%%sql
-- nyc_subway_stations: add a geography column so ST_Buffer/ST_Area
-- operate in meters rather than degrees.
ALTER TABLE nyc_subway_stations
ADD COLUMN IF NOT EXISTS geog geography;

UPDATE nyc_subway_stations
SET geog = ST_Transform(geom, 4326)::geography;

CREATE INDEX IF NOT EXISTS nyc_subway_stations_geog_idx ON nyc_subway_stations USING gist(geog);

-- queens_census_pop: same geography column and spatial index.
ALTER TABLE queens_census_pop
ADD COLUMN IF NOT EXISTS geog geography;

UPDATE queens_census_pop
SET geog = ST_Transform(geom, 4326)::geography;

CREATE INDEX IF NOT EXISTS queens_census_pop_geog_idx ON queens_census_pop USING gist(geog);
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
Done.
473 rows affected.
Done.
Done.
1739 rows affected.
Done.
[]

Spatial Analysis#

Find the total population, population with a bachelor’s degree or higher, and the percentage of those with a bachelor’s degree or higher within 200 and 500 meters from every subway station in Queens.

We will make use of ST_Intersection and ST_Buffer to calculate a “proportion of population”, assuming the population is equally distributed within each census block.

Let’s just add those buffers directly to the nyc_subway_stations table

    %%sql
    -- Precompute 500 m and 200 m buffers around every station;
    -- ST_Buffer on geography takes its radius in meters.
    ALTER TABLE nyc_subway_stations
    ADD COLUMN IF NOT EXISTS buffer_500 geography;

    ALTER TABLE nyc_subway_stations
    ADD COLUMN IF NOT EXISTS buffer_200 geography;

    UPDATE nyc_subway_stations
    SET buffer_500 = ST_Buffer(geog, 500);

    UPDATE nyc_subway_stations
    SET buffer_200 = ST_Buffer(geog, 200);
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
Done.
Done.
473 rows affected.
473 rows affected.
[]
%%sql subway_buffer_pop <<
-- Areal interpolation: each block group intersecting a station's 500 m buffer
-- contributes population proportional to the fraction of its area inside the
-- buffer (assumes population is uniformly distributed within a block group).
WITH subq1 AS (
    -- One row per (station, intersecting block group) with raw areas in m^2.
    SELECT 
    nss.ogc_fid,
    nss.name, 
    nss.line,
    qcp.geoid,
    qcp.pop_total,
    qcp.pop_bachelors + qcp.pop_masters + qcp.pop_professional_school + qcp.pop_doctorate as pop_bachelors_plus,
    ST_Area(qcp.geog) as cb_area_total,
    ST_Area(ST_Intersection(qcp.geog, nss.buffer_500)) as cb_area_in_buffer500,
    ST_Area(ST_Intersection(qcp.geog, nss.buffer_200)) as cb_area_in_buffer200,
    nss.geom
    FROM queens_census_pop qcp
    JOIN nyc_subway_stations nss 
    -- the 500 m buffer contains the 200 m one, so one spatial join suffices
    ON ST_INTERSECTS(qcp.geog, nss.buffer_500)
),
subq2 AS(
    -- Prorate each block group's population by its area fraction in the buffer.
    SELECT
    ogc_fid,
    name,
    line,
    pop_total * (cb_area_in_buffer500 / cb_area_total) as pop_total_in_buffer500,
    pop_total * (cb_area_in_buffer200 / cb_area_total) as pop_total_in_buffer200,
    pop_bachelors_plus * (cb_area_in_buffer500 / cb_area_total) as pop_bachelors_plus_in_buffer500,
    pop_bachelors_plus * (cb_area_in_buffer200 / cb_area_total) as pop_bachelors_plus_in_buffer200,
    geom
    FROM subq1
)
-- Sum the contributions across all intersecting block groups per station.
SELECT 
ogc_fid,
name,
line,
SUM(pop_total_in_buffer500)::int as pop_total_in_buffer500,
SUM(pop_total_in_buffer200)::int as pop_total_in_buffer200,
SUM(pop_bachelors_plus_in_buffer500)::int as pop_bachelors_plus_in_buffer500,
SUM(pop_bachelors_plus_in_buffer200)::int as pop_bachelors_plus_in_buffer200,
geom
FROM subq2
GROUP BY ogc_fid, name, line, geom
ORDER BY pop_bachelors_plus_in_buffer200 DESC;
 * postgresql+psycopg2://docker:***@0.0.0.0:25432/restaurants
90 rows affected.
Returning data to local variable subway_buffer_pop
# Preview the per-station population estimates.
subway_buffer_pop.DataFrame()
ogc_fid name line pop_total_in_buffer500 pop_total_in_buffer200 pop_bachelors_plus_in_buffer500 pop_bachelors_plus_in_buffer200 geom
0 164 75th Ave E-F 10108 2848 7206 2068 0104000020E6100000010000000101000000439B652890...
1 67 67th Ave E-M-R 19616 3407 11493 2065 0104000020E61000000100000001010000001E15244495...
2 37 Elmhurst Ave E-M-R 21058 5619 5906 1802 0104000020E61000000100000001010000008DA1DD4173...
3 79 Broadway N-W 16821 2647 9448 1550 0104000020E610000001000000010100000074DC1BAF40...
4 294 40th St 7 13959 3167 6179 1440 0104000020E6100000010000000101000000359FFF1323...
... ... ... ... ... ... ... ... ...
85 192 Grant Ave A-S 1940 0 349 0 0104000020E6100000010000000101000000BD89ABFA5C...
86 23 Mets - Willets Point 7-7 Express 0 0 0 0 0104000020E6100000010000000101000000A33850B81E...
87 305 Jefferson St L 471 0 151 0 0104000020E6100000010000000101000000FC0BB00111...
88 352 Roosevelt Island - Main St F 1 0 1 0 0104000020E6100000010000000101000000AC5F5FCD01...
89 228 Cypress Hills J 107 0 25 0 0104000020E610000001000000010100000087F6F381E4...

90 rows × 8 columns

Apparently you can find the most educated people within 200 meters of a Queens subway station at 75th Ave!